library(ggplot2)
library(MuMIn)
library(sjPlot)
library(reshape2)
library(plyr)
library(effects)
library(ggeffects)
library(MASS)
library(ordinal)
library(QuantPsyc)
library(BBmisc)
library(ggExtra)
library(standardize)
library(lme4)
library(mgcv)
library(itsadug)
library(tidyverse)
library(dplyr)
library(tidyr)
library(hrbrthemes)

sessionInfo()
# R version 4.5.0 (2025-04-11)
# Platform: aarch64-apple-darwin20
# Running under: macOS Sequoia 15.4.1
# 
# Matrix products: default
# BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
# LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
# 
# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
# 
# 
# attached base packages:
# [1] stats     graphics  grDevices utils     datasets  methods   base     
# 
# other attached packages:
# [1] hrbrthemes_0.8.7    lubridate_1.9.4     forcats_1.0.0       stringr_1.5.1       readr_2.1.5         tidyr_1.3.1         tibble_3.2.1        tidyverse_2.0.0     itsadug_2.4.1      
# [10] plotfunctions_1.4   mgcv_1.9-1          nlme_3.1-168        lme4_1.1-37         Matrix_1.7-3        standardize_0.2.2   ggExtra_0.10.1      BBmisc_1.13         QuantPsyc_1.6      
# [19] purrr_1.0.4         dplyr_1.1.4         boot_1.3-31         ordinal_2023.12-4.1 MASS_7.3-65         ggeffects_2.2.1     effects_4.2-2       carData_3.0-5       plyr_1.8.9         
# [28] reshape2_1.4.4      sjPlot_2.8.17       MuMIn_1.48.11       ggplot2_3.5.2      
# 
# loaded via a namespace (and not attached):
# [1] sjlabelled_1.2.0        tidyselect_1.2.1        farver_2.1.2            fastmap_1.2.0           fontquiver_0.2.1        promises_1.3.2          sjstats_0.19.0          digest_0.6.37          
# [9] timechange_0.3.0        mime_0.13               lifecycle_1.0.4         survival_3.8-3          magrittr_2.0.3          compiler_4.5.0          rlang_1.1.6             tools_4.5.0            
# [17] data.table_1.17.2       knitr_1.50              RColorBrewer_1.1-3      miniUI_0.1.2            withr_3.0.2             numDeriv_2016.8-1.1     nnet_7.3-20             grid_4.5.0             
# [25] datawizard_1.1.0        stats4_4.5.0            gdtools_0.4.2           xtable_1.8-4            colorspace_2.1-1        extrafontdb_1.0         scales_1.4.0            insight_1.2.0          
# [33] cli_3.6.5               survey_4.4-2            reformulas_0.4.1        generics_0.1.4          rstudioapi_0.17.1       performance_0.13.0      tzdb_0.5.0              minqa_1.2.8            
# [41] DBI_1.2.3               splines_4.5.0           mitools_2.4             vctrs_0.6.5             fontBitstreamVera_0.1.1 hms_1.1.3               systemfonts_1.2.3       glue_1.8.0             
# [49] nloptr_2.2.1            stringi_1.8.7           gtable_0.3.6            later_1.4.2             extrafont_0.19          pillar_1.10.2           htmltools_0.5.8.1       R6_2.6.1               
# [57] ucminf_1.2.2            Rdpack_2.6.4            evaluate_1.0.3          shiny_1.10.0            lattice_0.22-6          rbibutils_2.3           backports_1.5.0         fontLiberation_0.1.0   
# [65] httpuv_1.6.16           Rcpp_1.0.14             Rttf2pt1_1.3.12         checkmate_2.3.2         xfun_0.52               sjmisc_2.8.10           pkgconfig_2.0.3   


##### function definition #####
# function to unit-normalize a numeric variable
unit <- function(x) {
  (x-min(x))/(max(x)-min(x))
}

# function to box-cox transform a df column - depends on MASS
box_cox_transf <- function(x, min, max) {
  bc = boxcox(x ~ 1, lambda = seq(min, max, 0.1), plotit = FALSE)
  df = data.frame(bc$x, bc$y)
  df2 = df[with(df, order(-df$bc.y)),]
  lambda = df2[1, "bc.x"]
  print(lambda)
  if (lambda != 0) {
    x.transf = (x ^ lambda - 1)/lambda
  } else {
    x.transf = log(x)
  }
  return(x.transf)
}


##### read data and set variable types ####

df = read.csv("data/processed/pwdprotect_candidates_dummies_2.csv", header = T) 

df$district = factor(df$district)
df$pictureAvailable = factor(df$pictureAvailable)
df$pictureAvailable = mapvalues(df$pictureAvailable, from = c(0, 1), to = c('yes', 'no'))
df$age.c = df$age - median(df$age) 
df$onTheBallot.b = ifelse(df$onTheBallot == "no", "no", "yes")
df$electedToParliament.b = ifelse(df$electedToParliament == "no", "no", "yes")
df$onTheBallot.b = factor(df$onTheBallot.b)
df$electedToParliament.b = factor(df$electedToParliament.b)
df$in_blog = factor(df$in_blog)
df$in_blog = mapvalues(df$in_blog, from = c(0, 1), to = c('no', 'yes'))
df$in_facebook = factor(df$in_facebook)
df$in_facebook = mapvalues(df$in_facebook, from = c(0, 1), to = c('no', 'yes'))
df$in_forum = factor(df$in_forum)
df$in_forum = mapvalues(df$in_forum, from = c(0, 1), to = c('no', 'yes'))
df$in_meetup = factor(df$in_meetup)
df$in_meetup = mapvalues(df$in_meetup, from = c(0, 1), to = c('no', 'yes'))
df$already_candidate = factor(df$already_candidate)
df$already_candidate = mapvalues(df$already_candidate, from = c(0, 1), to = c('no', 'yes'))
df$all_platforms = factor(df$all_platforms)
df$all_platforms = mapvalues(df$all_platforms, from = c(0, 1), to = c('no', 'yes'))

# drop the smallest and largest district to exclude outliers
df = df[df$district != 27 & df$district != 28, ]


##### scale variables #####
# unit scale screen position and rank, and derive district size
unit_scaled_screen = with(df, tapply(screenPosition, district, FUN=unit))
unit_scaled_rank = with(df, tapply(rank, district, FUN=unit))

district_size = with(df, tapply(rank, district, FUN=max))
med.age = with(df, tapply(age, district, FUN=median))
sex.perc = df %>%
  group_by(district) %>%
  dplyr::summarise(f.perc = sum(sex == 'f', na.rm=T) / n())
pic.perc = df %>%
  group_by(district) %>%
  dplyr::mutate(logo = replace_na(logo,2)) %>%
  dplyr::summarise(logo.perc = sum(logo == 1, na.rm=T) / n())

for (i in 1:length(unit_scaled_screen)) {
  for (j in 1:length(unit_scaled_screen[[i]])) {
    df[df$district == i & df$rank == j, "screenPosition.unit"] = unit_scaled_screen[[i]][j]
    df[df$district == i & df$rank == j, "rank.unit"] = unit_scaled_rank[[i]][j]
    df[df$district == i & df$rank == j, 'district.size'] = district_size[[i]]
    df[df$district == i & df$rank == j, 'age.district.centered'] = df[df$district == i & df$rank == j, 'age'] - med.age[[i]]
    df[df$district == i & df$rank == j, 'f.perc'] = as.numeric(sex.perc[sex.perc$district == i, 'f.perc'])
    df[df$district == i & df$rank == j, 'logo.perc'] = as.numeric(pic.perc[pic.perc$district == i, 'logo.perc'])
  }
}

rm(i, district_size, unit_scaled_rank, unit_scaled_screen, pic.perc, sex.perc, med.age, j)

# make sure all variables have the right data type
df$logo = mapvalues(df$logo, from = c('0', '1'), to = c('no_logo', 'logo'))
df$logo[is.na(df$logo)] <- 'no_pic'
df$logo = factor(df$logo)
df$sex = mapvalues(factor(df$sex, levels = c('m', 'f')), from = c('m', 'f'), to = c('m', 'w'))
df$district.size.log = log2(df$district.size)


##### merge with other dfs #####
# centralities by platform
platformCentralities_df = read.csv('data/processed/pwdprotect_candidates_platforms_centralities.csv', sep = ',', header = T)
df = merge(
  dplyr::select(df, rank.unit, age.district.centered, screenPosition.unit, sex, logo, district.size.log, f.perc, logo.perc,
                candidate_ID, already_candidate, in_forum, in_meetup, in_blog, in_facebook, all_platforms, district),
  platformCentralities_df, on = 'candidate_ID')


# network measures for entities
userloc_df = read.csv('data/processed/pwdprotect_entity_userloc_giant_component_proj.metrics.csv', header = T, sep = ',')
colnames(userloc_df) <- paste(colnames(userloc_df), "userloc", sep = "_")
tmp1 = merge(df, userloc_df, by.x = 'candidate_ID', by.y = 'name_userloc', all.x = T)

userorg_df = read.csv('data/processed/pwdprotect_entity_userorg_giant_component_proj.metrics.csv', header = T, sep = ',')
colnames(userorg_df) <- paste(colnames(userorg_df), "userorg", sep = "_")
tmp2 = merge(tmp1, userorg_df, by.x = 'candidate_ID', by.y = 'name_userorg', all.x = T)

userper_df = read.csv('data/processed/pwdprotect_entity_userper_giant_component_proj.metrics.csv', header = T, sep = ',')
colnames(userper_df) <- paste(colnames(userper_df), "userper", sep = "_")
tmp3 = merge(tmp2, userper_df, by.x = 'candidate_ID', by.y = 'name_userper', all.x = T)

userurl_df = read.csv('data/processed/pwdprotect_entity_userurl_giant_component_proj_late_2012.metrics.csv', header = T, sep = ',')
colnames(userurl_df) <- paste(colnames(userurl_df), "userurl", sep = "_")
tmp4 = merge(tmp3, userurl_df, by.x = 'candidate_ID', by.y = 'name_userurl', all.x = T)

user_giantComp_df = read.csv('data/processed/pwdprotect_user_giant_component.metrics.csv', header = T, sep = ',')
colnames(user_giantComp_df) <- paste(colnames(user_giantComp_df), "usergiant", sep = "_")
df = merge(tmp4, user_giantComp_df, by.x = 'candidate_ID', by.y = 'name_usergiant', all.x = T)
rm(tmp1, tmp2, tmp3, tmp4)

# topics
ctmtopic_gini_df = read.csv('data/processed/pwdprotect_candidates_topics_GINI_cosine.csv', header = T, sep = ',')
tmp1 = merge(df, ctmtopic_gini_df, by = 'candidate_ID', all.x = T)

# used for a robustness check
#berttopic_gini_df = read.csv('data/processed/pwdprotect_BERTopics_topics_cosine_GINI.csv', header = T, sep = ',')
#tmp1 = merge(df, berttopic_gini_df, by.x = 'candidate_ID', by.y = 'name', all.x = T)

topic_BT_df = read.csv('data/processed/pwdprotect_topic_BT_usertopic_unproj_and_giant_component_proj.metrics.csv', header = T, sep = ',')
colnames(topic_BT_df) <- paste(colnames(topic_BT_df), "BT", sep = "_")
tmp2 = merge(tmp1, topic_BT_df, by.x = 'candidate_ID', by.y = 'name_BT', all.x = T)

topic_LT_df = read.csv('data/processed/pwdprotect_topic_LT_usertopic_unproj_and_giant_component_proj.metrics.csv', header = T, sep = ',')
colnames(topic_LT_df) <- paste(colnames(topic_LT_df), "LT", sep = "_")
df = merge(tmp2, topic_LT_df, by.x = 'candidate_ID', by.y = 'name_LT', all.x = T)
rm(tmp1, tmp2)

# vanity metrics
vanity_df = read.csv('data/processed/pwdprotect_candidates_vanity.csv', sep = ',', header = T)
df = merge(df, vanity_df, by = 'candidate_ID', all.x = T)
df = df %>% distinct()

library(randomForest)  # version: randomForest_4.7-1.2

set.seed(111) # uncomment if you want to generate the exact numbers reported in the paper

# baseline model with predictors relating to the interface
# all RFs fit with default values 
reps = 100
seeds = sample(1:10000, reps)
t.score = qt(p=0.01/2, df=reps-1, lower.tail=F)

##### interface predictors #####
var_expl_interface = NULL
importance_interface = NULL
for (i in 1:reps) {
  set.seed(seeds[i])
  model.interface = randomForest(
    rank.unit ~ age.district.centered + screenPosition.unit + sex + logo + district.size.log + f.perc + logo.perc,
    data = df, importance = T, ntree = 500
  )
  var_expl_interface[i] = round(100 * model.interface$rsq[length(model.interface$rsq)], digits = 2)
  imp = as.data.frame(model.interface$importance)
  imp$iter = i
  imp$variable <- rownames(imp)
  rownames(imp) <- NULL
  importance_interface = rbind(importance_interface,
                               imp)
}
rm(i, imp, model.interface)

avg_interface = mean(var_expl_interface)
sem_interface = sd(var_expl_interface) / sqrt(reps)
margin.error_interface <- t.score * sem_interface
print(c(avg_interface - margin.error_interface, avg_interface + margin.error_interface))
# 19.55481 19.67719

importance_interface$variable = factor(importance_interface$variable)
importance_interface %>% ggplot(aes(y = fct_reorder(variable, `%IncMSE`, mean), x =`%IncMSE`)) +
  geom_boxplot() +
  theme(
    legend.position = "none"
  ) +
  labs(x = 'percent increment in MSE', y = '', title = 'Feature importance - interface')



##### past elections predictors #####
# on top of the baseline model I'm adding predictors in batches that capture some phenomenon of interest
# here I start from offline recognition from elections
var_expl_offline = NULL
importance_offline = NULL
for (i in 1:reps) {
  set.seed(seeds[i])
  model.offline = randomForest(
    rank.unit ~ age.district.centered + screenPosition.unit + sex + logo + district.size.log + f.perc + logo.perc +
      already_candidate,
    data = df, importance = T, ntree = 500
  )
  var_expl_offline[i] = round(100 * model.offline$rsq[length(model.offline$rsq)], digits = 2)
  imp = as.data.frame(model.offline$importance)
  imp$iter = i
  imp$variable <- rownames(imp)
  rownames(imp) <- NULL
  importance_offline = rbind(importance_offline,
                             imp)
}
rm(i, imp, model.offline)

avg_offline = mean(var_expl_offline)
sem_offline = sd(var_expl_offline) / sqrt(reps)
margin.error_offline <- t.score * sem_offline
print(c(avg_offline - margin.error_offline, avg_offline + margin.error_offline))
# 22.12678 22.26402
t.test(var_expl_interface, var_expl_offline)
# t = -73.687, df = 195.46, p-value < 2.2e-16
importance_offline$variable = factor(importance_offline$variable)
importance_offline %>% ggplot(aes(y = fct_reorder(variable, `%IncMSE`, mean), x =`%IncMSE`)) +
  geom_boxplot() +
  theme(
    legend.position = "none"
  ) +
  labs(x = 'percent increment in MSE', y = '', title = 'Feature importance - previous candidacy')


##### online presence predictors #####
var_expl_onlinePresence = NULL
importance_onlinePresence = NULL
for (i in 1:reps) {
  set.seed(seeds[i])
  model.onlinePresence = randomForest(
    rank.unit ~ age.district.centered + screenPosition.unit + sex + logo + district.size.log + f.perc + logo.perc +
      already_candidate +
      in_forum + in_meetup + in_blog + in_facebook + all_platforms,
    data = df, importance = T, ntree = 500
  )
  var_expl_onlinePresence[i] = round(100 * model.onlinePresence$rsq[length(model.onlinePresence$rsq)], digits = 2)
  imp = as.data.frame(model.onlinePresence$importance)
  imp$iter = i
  imp$variable <- rownames(imp)
  rownames(imp) <- NULL
  importance_onlinePresence = rbind(importance_onlinePresence,
                                    imp)
}
rm(i, imp, model.onlinePresence)

avg_onlinePresence = mean(var_expl_onlinePresence)
sem_onlinePresence = sd(var_expl_onlinePresence) / sqrt(reps)
margin.error_onlinePresence <- t.score * sem_onlinePresence
print(c(avg_onlinePresence - margin.error_onlinePresence, avg_onlinePresence + margin.error_onlinePresence))
# 24.45943 24.60957
t.test(var_expl_offline, var_expl_onlinePresence)
# -60.406, df = 196.42, p-value < 2.2e-16

importance_onlinePresence$variable = factor(importance_onlinePresence$variable)
importance_onlinePresence %>% ggplot(aes(y = fct_reorder(variable, `%IncMSE`, mean), x =`%IncMSE`)) +
  geom_boxplot() +
  theme(
    legend.position = "none"
  ) +
  labs(x = 'percent increment in MSE', y = '', title = 'Feature importance - online presence')




##### online centralities predictors #####
# convert centrality measures to tertiles per district mapping NAs to 0
centralities = colnames(df)[grepl('centrality', colnames(df), fixed = T)]
districts = unique(df$district)
for (col_name in centralities) {
  for (district_id in districts) {
    print(col_name)
    print(district_id)
    tertiles = quantile(df[df$district == district_id, col_name], probs = c(0.33, 0.67), na.rm = T)
    print(tertiles)
    df[df$district == district_id & is.na(df[,col_name]),col_name] = 0
    if (sum(df[df$district == district_id, col_name]) != 0) {
      df[df$district == district_id & df[,col_name] > tertiles[2],col_name] = 3
      df[df$district == district_id & df[,col_name] > tertiles[1] & df[,col_name] <= tertiles[2],col_name] = 2
      df[df$district == district_id & df[,col_name] > 0 & df[,col_name] <= tertiles[1],col_name] = 1
    } else {
    }
  }
  df[,col_name] = ordered(
    df[,col_name], levels=c(0, 1, 2, 3),
    labels=c("not present", "lower third", "mid third", "upper third"))
}

var_expl_platformCentrality = NULL
importance_platformCentrality = NULL
for (i in 1:reps) {
  set.seed(seeds[i])
  model.platformCentrality = randomForest(
    rank.unit ~ age.district.centered + screenPosition.unit + sex + logo + district.size.log + f.perc + logo.perc +
      already_candidate +
      blog_degree_centrality + blog_eigenvector_centrality + blog_in_degree_centrality + blog_out_degree_centrality +
      forum_degree_centrality + forum_eigenvector_centrality + forum_in_degree_centrality + forum_out_degree_centrality +
      meetup_degree_centrality + meetup_eigenvector_centrality + facebook_degree_centrality + facebook_eigenvector_centrality + 
      facebook_in_degree_centrality + facebook_out_degree_centrality,
    data = df, importance = T, ntree = 500
  )
  var_expl_platformCentrality[i] = round(100 * model.platformCentrality$rsq[length(model.platformCentrality$rsq)], digits = 2)
  imp = as.data.frame(model.platformCentrality$importance)
  imp$iter = i
  imp$variable <- rownames(imp)
  rownames(imp) <- NULL
  importance_platformCentrality = rbind(importance_platformCentrality,
                                        imp)
}
rm(i, imp, model.platformCentrality)

avg_platformCentrality = mean(var_expl_platformCentrality)
sem_platformCentrality = sd(var_expl_platformCentrality) / sqrt(reps)
margin.error_platformCentrality <- t.score * sem_platformCentrality
print(c(avg_platformCentrality - margin.error_platformCentrality, avg_platformCentrality + margin.error_platformCentrality))
# 25.26757 25.40843
t.test(var_expl_onlinePresence, var_expl_platformCentrality)
# -20.502, df = 197.2, p-value < 2.2e-16
importance_platformCentrality$variable = factor(importance_platformCentrality$variable)
importance_platformCentrality %>% ggplot(aes(y = fct_reorder(variable, `%IncMSE`, mean), x =`%IncMSE`)) +
  geom_boxplot() +
  theme(
    legend.position = "none"
  ) +
  labs(x = 'percent increment in MSE', y = '', title = 'Feature importance - platform centralities')



##### network measures #####

# userloc_df[userloc_df$name_userloc == 'beppe grillo', ]  # in community 2
df$communities_userloc_leader = ifelse(df$communities_userloc == 2, 'yes', 'no')
df$communities_userloc_leader[is.na(df$communities_userloc_leader)] <- 'not_present'

# userper_df[userper_df$name_userper == 'beppe grillo', ]  # in community 2
df$communities_userper_leader = ifelse(df$communities_userper == 2, 'yes', 'no')
df$communities_userper_leader[is.na(df$communities_userper_leader)] <- 'not_present'

# userorg_df[userorg_df$name_userorg == 'beppe grillo', ]  # in community 3
df$communities_userorg_leader = ifelse(df$communities_userorg == 3, 'yes', 'no')
df$communities_userorg_leader[is.na(df$communities_userorg_leader)] <- 'not_present'

# userurl_df[userurl_df$name_userurl == 'beppe grillo', ]  # in community 6
df$communities_userurl_leader = ifelse(df$communities_userurl == 6, 'yes', 'no')
df$communities_userurl_leader[is.na(df$communities_userurl_leader)] <- 'not_present'

# user_giantComp_df[user_giantComp_df$name_usergiant == 'beppe grillo', ]  # in community 2
df$communities_usergiant_leader = ifelse(df$communities_usergiant == 2, 'yes', 'no')
df$communities_usergiant_leader[is.na(df$communities_usergiant_leader)] <- 'not_present'

cols_giantComp = colnames(df)[grepl('usergiant', colnames(df), fixed = T)]
cols_userloc = colnames(df)[grepl('userloc', colnames(df), fixed = T)]
cols_userorg = colnames(df)[grepl('userorg', colnames(df), fixed = T)]
cols_userper = colnames(df)[grepl('userper', colnames(df), fixed = T)]
cols_userurl = colnames(df)[grepl('userurl', colnames(df), fixed = T)]
cols_network = c(cols_giantComp, cols_userloc, cols_userorg, cols_userper, cols_userurl)
cols_network = cols_network[!grepl('centrality', cols_network, fixed = T)]
cols_network = cols_network[!grepl('communities', cols_network, fixed = T)]
districts = unique(df$district)
for (col_name in cols_network) {
  for (district_id in districts) {
    print(col_name)
    print(district_id)
    tertiles = quantile(df[df$district == district_id, col_name], probs = c(0.33, 0.67), na.rm = T)
    print(tertiles)
    df[df$district == district_id & is.na(df[,col_name]),col_name] = 0
    if (sum(df[df$district == district_id, col_name]) != 0) {
      df[df$district == district_id & df[,col_name] > tertiles[2],col_name] = 3
      df[df$district == district_id & df[,col_name] > tertiles[1] & df[,col_name] <= tertiles[2],col_name] = 2
      df[df$district == district_id & df[,col_name] > 0 & df[,col_name] <= tertiles[1],col_name] = 1
    } else {
    }
  }
  df[,col_name] = ordered(
    df[,col_name], levels=c(0, 1, 2, 3),
    labels=c("not present", "lower third", "mid third", "upper third"))
}

# ----- user-loc network ----
# this block of predictors improves variance explained over platform centralities
var_expl_userloc = NULL
importance_userloc = NULL
for (i in 1:reps) {
  set.seed(seeds[i])
  model.userloc = randomForest(
    rank.unit ~ age.district.centered + screenPosition.unit + sex + logo + district.size.log + f.perc + logo.perc +
      already_candidate +
      blog_degree_centrality + blog_eigenvector_centrality + blog_in_degree_centrality + blog_out_degree_centrality +
      forum_degree_centrality + forum_eigenvector_centrality + forum_in_degree_centrality + forum_out_degree_centrality +
      meetup_degree_centrality + meetup_eigenvector_centrality + facebook_degree_centrality + facebook_eigenvector_centrality + 
      facebook_in_degree_centrality + facebook_out_degree_centrality +
      betweenness_userloc + closeness_userloc + eigen_centrality_userloc + degree_userloc + page_rank_userloc + dist_leader_userloc + 
      communities_userloc_leader,
    data = df, importance = T, ntree = 500
  )
  var_expl_userloc[i] = round(100 * model.userloc$rsq[length(model.userloc$rsq)], digits = 2)
  imp = as.data.frame(model.userloc$importance)
  imp$iter = i
  imp$variable <- rownames(imp)
  rownames(imp) <- NULL
  importance_userloc = rbind(importance_userloc,
                             imp)
}
rm(i, imp, model.userloc)

avg_userloc = mean(var_expl_userloc)
sem_userloc = sd(var_expl_userloc) / sqrt(reps)
margin.error_userloc <- t.score * sem_userloc
print(c(avg_userloc - margin.error_userloc, avg_userloc + margin.error_userloc))
# 25.71605 25.86615
t.test(var_expl_platformCentrality, var_expl_userloc)
# -11.562, df = 197.21, p-value < 2.2e-16
importance_userloc$variable = factor(importance_userloc$variable)
importance_userloc %>% ggplot(aes(y = fct_reorder(variable, `%IncMSE`, mean), x =`%IncMSE`)) +
  geom_boxplot() +
  theme(
    legend.position = "none"
  ) +
  labs(x = 'percent increment in MSE', y = '', title = 'Feature importance - user:loc network measures')

# ----- user-org network ----
# this block of predictors doesn't improve variance explained over platform centralities
var_expl_userorg = NULL
importance_userorg = NULL
for (i in 1:reps) {
  set.seed(seeds[i])
  model.userorg = randomForest(
    rank.unit ~ age.district.centered + screenPosition.unit + sex + logo + district.size.log + f.perc + logo.perc +
      already_candidate +
      blog_degree_centrality + blog_eigenvector_centrality + blog_in_degree_centrality + blog_out_degree_centrality +
      forum_degree_centrality + forum_eigenvector_centrality + forum_in_degree_centrality + forum_out_degree_centrality +
      meetup_degree_centrality + meetup_eigenvector_centrality +
      facebook_degree_centrality + facebook_eigenvector_centrality + facebook_in_degree_centrality + facebook_out_degree_centrality +
      communities_userorg_leader + betweenness_userorg + closeness_userorg + eigen_centrality_userorg + degree_userorg + 
      page_rank_userorg + dist_leader_userorg + communities_userorg_leader,
    data = df, importance = T, ntree = 500
  )
  var_expl_userorg[i] = round(100 * model.userorg$rsq[length(model.userorg$rsq)], digits = 2)
  imp = as.data.frame(model.userorg$importance)
  imp$iter = i
  imp$variable <- rownames(imp)
  rownames(imp) <- NULL
  importance_userorg = rbind(importance_userorg,
                             imp)
}
rm(i, imp, model.userorg)

avg_userorg = mean(var_expl_userorg)
sem_userorg = sd(var_expl_userorg) / sqrt(reps)
margin.error_userorg <- t.score * sem_userorg
print(c(avg_userorg - margin.error_userorg, avg_userorg + margin.error_userorg))
# 25.3896 25.5424
t.test(var_expl_platformCentrality, var_expl_userorg)
# -3.2353, df = 196.71, p-value = 0.001425
importance_userorg$variable = factor(importance_userorg$variable)
importance_userorg %>% ggplot(aes(y = fct_reorder(variable, `%IncMSE`, mean), x =`%IncMSE`)) +
  geom_boxplot() +
  theme(
    legend.position = "none"
  ) +
  labs(x = 'percent increment in MSE', y = '', title = 'Feature importance - user:org network measures')


# ----- user-per network ----
# this block of predictors doesn't improve variance explained over user:loc network predictors
var_expl_userper = NULL
importance_userper = NULL
for (i in 1:reps) {
  set.seed(seeds[i])
  model.userper = randomForest(
    rank.unit ~ age.district.centered + screenPosition.unit + sex + logo + district.size.log + f.perc + logo.perc +
      already_candidate +
      blog_degree_centrality + blog_eigenvector_centrality + blog_in_degree_centrality + blog_out_degree_centrality +
      forum_degree_centrality + forum_eigenvector_centrality + forum_in_degree_centrality + forum_out_degree_centrality +
      meetup_degree_centrality + meetup_eigenvector_centrality +
      facebook_degree_centrality + facebook_eigenvector_centrality + facebook_in_degree_centrality + facebook_out_degree_centrality +
      betweenness_userper + closeness_userper + eigen_centrality_userper + degree_userper + page_rank_userper + dist_leader_userper + 
      communities_userper_leader +
      betweenness_userloc + closeness_userloc + eigen_centrality_userloc + degree_userloc + page_rank_userloc + dist_leader_userloc + 
      communities_userloc_leader,
    data = df, importance = T, ntree = 500
  )
  var_expl_userper[i] = round(100 * model.userper$rsq[length(model.userper$rsq)], digits = 2)
  imp = as.data.frame(model.userper$importance)
  imp$iter = i
  imp$variable <- rownames(imp)
  rownames(imp) <- NULL
  importance_userper = rbind(importance_userper,
                             imp)
}
rm(i, imp, model.userper)

avg_userper = mean(var_expl_userper)
sem_userper = sd(var_expl_userper) / sqrt(reps)
margin.error_userper <- t.score * sem_userper
print(c(avg_userper - margin.error_userper, avg_userper + margin.error_userper))
# 25.67759 25.84921
t.test(var_expl_platformCentrality, var_expl_userper)
# -10.064, df = 190.75, p-value < 2.2e-16
importance_userper$variable = factor(importance_userper$variable)
importance_userper %>% ggplot(aes(y = fct_reorder(variable, `%IncMSE`, mean), x =`%IncMSE`)) +
  geom_boxplot() +
  theme(
    legend.position = "none"
  ) +
  labs(x = 'percent increment in MSE', y = '')


# ----- user-url network ----
# this block of predictors does improve over user:loc network measures, which do not contribute anything and are thus dropped.
var_expl_userurl = NULL
importance_userurl = NULL
for (i in 1:reps) {
  set.seed(seeds[i])
  model.userurl = randomForest(
    rank.unit ~ age.district.centered + screenPosition.unit + sex + logo + district.size.log + f.perc + logo.perc +
      already_candidate +
      blog_degree_centrality + blog_eigenvector_centrality + blog_in_degree_centrality + blog_out_degree_centrality +
      forum_degree_centrality + forum_eigenvector_centrality + forum_in_degree_centrality + forum_out_degree_centrality +
      meetup_degree_centrality + meetup_eigenvector_centrality + facebook_degree_centrality + facebook_eigenvector_centrality + 
      facebook_in_degree_centrality + facebook_out_degree_centrality +
      betweenness_userurl + closeness_userurl + eigen_centrality_userurl + degree_userurl + page_rank_userurl + dist_leader_userurl + 
      communities_userurl_leader,
    data = df, importance = T, ntree = 500
  )
  var_expl_userurl[i] = round(100 * model.userurl$rsq[length(model.userurl$rsq)], digits = 2)
  imp = as.data.frame(model.userurl$importance)
  imp$iter = i
  imp$variable <- rownames(imp)
  rownames(imp) <- NULL
  importance_userurl = rbind(importance_userurl,
                             imp)
}
rm(i, imp, model.userurl)

avg_userurl = mean(var_expl_userurl)
sem_userurl = sd(var_expl_userurl) / sqrt(reps)
margin.error_userurl <- t.score * sem_userurl
print(c(avg_userurl - margin.error_userurl, avg_userurl + margin.error_userurl))
# 25.96355 26.11465
t.test(var_expl_platformCentrality, var_expl_userurl)
# -17.827, df = 197.03, p-value < 2.2e-16
importance_userurl$variable = factor(importance_userurl$variable)
importance_userurl %>% ggplot(aes(y = fct_reorder(variable, `%IncMSE`, mean), x =`%IncMSE`)) +
  geom_boxplot() +
  theme(
    legend.position = "none"
  ) +
  labs(x = 'percent increment in MSE', y = '', title = 'Feature importance - user:url network measures')


# ----- giant comp network ----
# this block of predictors improves predictions over centrality measures, and measures from the user:url network do not further improve 
# predictions. we proceed with this block of predictors.
# this is the model which afford the best variance explained.
var_expl_usergiant = NULL
importance_usergiant = NULL
for (i in 1:reps) {
  set.seed(seeds[i])
  model.usergiant = randomForest(
    rank.unit ~ age.district.centered + screenPosition.unit + sex + logo + district.size.log + f.perc + logo.perc +
      already_candidate +
      blog_degree_centrality + blog_eigenvector_centrality + blog_in_degree_centrality + blog_out_degree_centrality +
      forum_degree_centrality + forum_eigenvector_centrality + forum_in_degree_centrality + forum_out_degree_centrality +
      meetup_degree_centrality + meetup_eigenvector_centrality + facebook_degree_centrality + facebook_eigenvector_centrality + 
      facebook_in_degree_centrality + facebook_out_degree_centrality +
      communities_usergiant_leader + betweenness_usergiant + closeness_usergiant + eigen_centrality_usergiant + degree_usergiant + 
      page_rank_usergiant + dist_leader_usergiant,
    data = df, importance = T, ntree = 500
  )
  var_expl_usergiant[i] = round(100 * model.usergiant$rsq[length(model.usergiant$rsq)], digits = 2)
  imp = as.data.frame(model.usergiant$importance)
  imp$iter = i
  imp$variable <- rownames(imp)
  rownames(imp) <- NULL
  importance_usergiant = rbind(importance_usergiant,
                               imp)
}
rm(i, imp, model.usergiant)

avg_usergiant = mean(var_expl_usergiant)
sem_usergiant = sd(var_expl_usergiant) / sqrt(reps)
margin.error_usergiant <- t.score * sem_usergiant
print(c(avg_usergiant - margin.error_usergiant, avg_usergiant + margin.error_usergiant))
# 26.24922 26.40018
t.test(var_expl_platformCentrality, var_expl_usergiant)
# -25.103, df = 197.06, p-value < 2.2e-16
importance_usergiant$variable = factor(importance_usergiant$variable)
pdf(file = "featureImportance_RF_giantComp.pdf",  
    width = 6,
    height = 9)
importance_usergiant %>% ggplot(aes(y = fct_reorder(variable, `%IncMSE`, mean), x =`%IncMSE`)) +
  geom_boxplot() +
  theme(
    legend.position = "none"
  ) +
  labs(x = 'percent increment in MSE', y = '', title = 'Feature importance - giant component')
dev.off()


##### topics ####

# this block of predictors does not improve prediction over network metrics from the giant component
# although some topics appear to be rather important
cols_BT = colnames(df)[grepl('_BT', colnames(df), fixed = T)]
cols_LT = colnames(df)[grepl('_LT', colnames(df), fixed = T)]
cols_topics = c(cols_BT, cols_LT)
cols_topics = cols_topics[!grepl('centrality', cols_topics, fixed = T)]
cols_topics = cols_topics[!grepl('communities', cols_topics, fixed = T)]
cols_topics = cols_topics[!grepl('betweenness', cols_topics, fixed = T)]
cols_topics = cols_topics[!grepl('closeness', cols_topics, fixed = T)]
cols_topics = cols_topics[!grepl('degree', cols_topics, fixed = T)]
cols_topics = cols_topics[!grepl('page_rank', cols_topics, fixed = T)]
districts = unique(df$district)
for (col_name in cols_topics) {
  for (district_id in districts) {
    print(col_name)
    print(district_id)
    tertiles = quantile(df[df$district == district_id, col_name], probs = c(0.33, 0.67), na.rm = T)
    print(tertiles)
    df[df$district == district_id & is.na(df[,col_name]),col_name] = 0
    if (sum(df[df$district == district_id, col_name]) != 0) {
      df[df$district == district_id & df[,col_name] > tertiles[2],col_name] = 3
      df[df$district == district_id & df[,col_name] > tertiles[1] & df[,col_name] <= tertiles[2],col_name] = 2
      df[df$district == district_id & df[,col_name] > 0 & df[,col_name] <= tertiles[1],col_name] = 1
    } else {
    }
  }
  df[,col_name] = ordered(
    df[,col_name], levels=c(0, 1, 2, 3),
    labels=c("not present", "lower third", "mid third", "upper third"))
}

var_expl_topics = NULL
importance_topics = NULL
for (i in 1:reps) {
  set.seed(seeds[i])
  model.topics = randomForest(
    rank.unit ~ age.district.centered + screenPosition.unit + sex + logo + district.size.log + f.perc + logo.perc +
      already_candidate +
      blog_degree_centrality + blog_eigenvector_centrality + blog_in_degree_centrality + blog_out_degree_centrality +
      forum_degree_centrality + forum_eigenvector_centrality + forum_in_degree_centrality + forum_out_degree_centrality +
      meetup_degree_centrality + meetup_eigenvector_centrality + facebook_degree_centrality + facebook_eigenvector_centrality + 
      facebook_in_degree_centrality + facebook_out_degree_centrality +
      communities_usergiant_leader + betweenness_usergiant + closeness_usergiant + eigen_centrality_usergiant + degree_usergiant + 
      page_rank_usergiant + dist_leader_usergiant +
      GINI_BT + cosine_distance_BT + legge_referendum_cittadini_BT + berlusconi_presidente_legge_BT +  politica_popolo_politici_BT + 
      blog_grillo_post_BT + italia_grecia_debito_BT + pd_pdl_pietro_BT + movimento_politica_stelle_BT + evasione_iva_fiscale_BT +
      nucleare_energia_centrali_BT + berlusconi_nord_mafia_BT + tv_rai_italia_BT + guerra_armi_pianeta_BT +
      GINI_LT + cosine_distance_LT + italia_nucleare_tav_LT + grecia_debito_euro_LT + stare_bambino_donne_LT +
      mafia_ciancimino_palermo_LT + cina_petrolio_pianeta_LT + parlamento_legge_pulito_LT + rifiuti_raccolta_ambientale_LT +
      pd_partito_sinistra_LT + processo_corte_tribunale_LT + berlusconi_legge_lodo_LT + partiti_cittadini_politica_LT +
      reato_prescrizione_processo_LT + giornali_pubblicita_rete_LT + de_magistris_genchi_LT + politica_cittadini_legge_LT +
      presidente_partito_senatore_LT + politica_espandi_comprimi_LT + berlusconi_fininvest_soldi_LT + euro_pensione_pagare_LT +
      stelle_movimento_lista_LT + soldi_banche_telecom_LT,
    data = df, importance = T, ntree = 500
  )
  var_expl_topics[i] = round(100 * model.topics$rsq[length(model.topics$rsq)], digits = 2)
  imp = as.data.frame(model.topics$importance)
  imp$iter = i
  imp$variable <- rownames(imp)
  rownames(imp) <- NULL
  importance_topics = rbind(importance_topics,
                            imp)
}
rm(i, imp, model.topics)

avg_topics = mean(var_expl_topics)
sem_topics = sd(var_expl_topics) / sqrt(reps)
margin.error_topics <- t.score * sem_topics
print(c(avg_topics - margin.error_topics, avg_topics + margin.error_topics))
# 25.53443 25.69117
t.test(var_expl_usergiant, var_expl_topics)
# t = 17.184, df = 197.72, p-value < 2.2e-16
importance_topics$variable = factor(importance_topics$variable)
importance_topics %>% ggplot(aes(y = fct_reorder(variable, `%IncMSE`, mean), x =`%IncMSE`)) +
  geom_boxplot() +
  theme(
    legend.position = "none"
  ) +
  labs(x = 'percent increment in MSE', y = '', title = 'Feature Importance - topics')



##### vanity metrics ####
vanity_cols = c("blog_comments_number", "blog_number_of_posts_commented", "forum_comments_dislikes",      
                "forum_comments_numReports", "forum_comments_likes", "forum_comments_number", "forum_thread_dislikes",    
                "forum_thread_postNumber", "forum_thread_likes", "forum_thread_score", "forum_thread_number", 
                "meetup_events_number", "days_on_meetup", "facebook_number_out_posts", "facebook_number_out_comments",
                "facebook_number_out_likes", "facebook_number_in_comments", "facebook_number_in_likes", "facebook_number_days_active")
districts = unique(df$district)
for (col_name in vanity_cols) {
  for (district_id in districts) {
    print(col_name)
    print(district_id)
    tertiles = quantile(df[df$district == district_id, col_name], probs = c(0.33, 0.67), na.rm = T)
    print(tertiles)
    df[df$district == district_id & is.na(df[,col_name]),col_name] = 0
    if (sum(df[df$district == district_id, col_name]) != 0) {
      df[df$district == district_id & df[,col_name] > tertiles[2],col_name] = 3
      df[df$district == district_id & df[,col_name] > tertiles[1] & df[,col_name] <= tertiles[2],col_name] = 2
      df[df$district == district_id & df[,col_name] > 0 & df[,col_name] <= tertiles[1],col_name] = 1
    } else {
    }
  }
  df[,col_name] = ordered(
    df[,col_name], levels=c(0, 1, 2, 3),
    labels=c("not present", "lower third", "mid third", "upper third"))
}
df$days_on_meetup[is.na(df$days_on_meetup)] <- 'not present'

var_expl_vanity = NULL
importance_vanity = NULL
for (i in 1:reps) {
  set.seed(seeds[i])
  model.vanity = randomForest(
    rank.unit ~ age.district.centered + screenPosition.unit + sex + logo + district.size.log + f.perc + logo.perc +
      already_candidate +
      blog_degree_centrality + blog_eigenvector_centrality + blog_in_degree_centrality + blog_out_degree_centrality +
      forum_degree_centrality + forum_eigenvector_centrality + forum_in_degree_centrality + forum_out_degree_centrality +
      meetup_degree_centrality + meetup_eigenvector_centrality + facebook_degree_centrality + facebook_eigenvector_centrality + 
      facebook_in_degree_centrality + facebook_out_degree_centrality +
      communities_usergiant_leader + betweenness_usergiant + closeness_usergiant + eigen_centrality_usergiant + degree_usergiant + 
      page_rank_usergiant + dist_leader_usergiant +
      blog_comments_number + blog_number_of_posts_commented + forum_comments_dislikes + forum_comments_numReports + forum_comments_likes + 
      forum_comments_number + forum_thread_dislikes + forum_thread_postNumber + forum_thread_likes + forum_thread_score + forum_thread_number + 
      facebook_number_in_comments + facebook_number_in_likes + facebook_number_days_active + meetup_events_number + facebook_number_out_posts + 
      facebook_number_out_comments + facebook_number_out_likes + days_on_meetup,
    data = df, importance = T, ntree = 500
  )
  var_expl_vanity[i] = round(100 * model.vanity$rsq[length(model.vanity$rsq)], digits = 2)
  imp = as.data.frame(model.vanity$importance)
  imp$iter = i
  imp$variable <- rownames(imp)
  rownames(imp) <- NULL
  importance_vanity = rbind(importance_vanity,
                            imp)
  # df$interfacePredictions = predict(model.interface, predict.all = T)
}
rm(i, imp, model.vanity)

avg_vanity = mean(var_expl_vanity)
sem_vanity = sd(var_expl_vanity) / sqrt(reps)
margin.error_vanity <- t.score * sem_vanity
print(c(avg_vanity - margin.error_vanity, avg_vanity + margin.error_vanity))
# 26.05679 26.19861
t.test(var_expl_usergiant, var_expl_vanity)
# 4.9961, df = 197.23, p-value = 1.285e-06
importance_vanity$variable = factor(importance_vanity$variable)
importance_vanity %>% ggplot(aes(y = fct_reorder(variable, `%IncMSE`, mean), x =`%IncMSE`)) +
  geom_boxplot() +
  theme(
    legend.position = "none"
  ) +
  labs(x = 'percent increment in MSE', y = '', title = 'Feature Importance - vanity metrics')



##### analysis of predictions ####
model.usergiant = randomForest(
  rank.unit ~ age.district.centered + screenPosition.unit + sex + logo + district.size.log + f.perc + logo.perc +
    already_candidate +
    blog_degree_centrality + blog_eigenvector_centrality + blog_in_degree_centrality + blog_out_degree_centrality +
    forum_degree_centrality + forum_eigenvector_centrality + forum_in_degree_centrality + forum_out_degree_centrality +
    meetup_degree_centrality + meetup_eigenvector_centrality + facebook_degree_centrality + facebook_eigenvector_centrality + 
    facebook_in_degree_centrality + facebook_out_degree_centrality +
    communities_usergiant_leader + betweenness_usergiant + closeness_usergiant + eigen_centrality_usergiant + degree_usergiant + 
    page_rank_usergiant + dist_leader_usergiant,
  data = df, importance = T, ntree = 500
)
df$giantPredictions = predict(model.usergiant, predict.all = T)

# pdf(file = "predictedValues_meetup_eigen.pdf",  
#     width = 5,
#     height = 3.5)
# ggplot(data = df, aes(x=meetup_eigenvector_centrality, y=giantPredictions)) +
#   geom_boxplot() +
#   labs(x = 'Eigenvector centrality, MeetUp', y = 'predicted rank')
# dev.off()
# 
# pdf(file = "predictedValues_forum_eigen.pdf",  
#     width = 5,
#     height = 3.5)
# ggplot(data = df, aes(x=forum_eigenvector_centrality, y=giantPredictions)) +
#   geom_boxplot() +
#   labs(x = 'Eigenvector centrality, M5S forum', y = 'predicted rank')
# dev.off()
# 
# pdf(file = "predictedValues_facebook_outdegree.pdf",  
#     width = 5,
#     height = 3.5)
# ggplot(data = df, aes(x=facebook_out_degree_centrality, y=giantPredictions)) +
#   geom_boxplot() +
#   labs(x = 'Out degree centrality, Facebook', y = 'predicted rank')
# dev.off()
# 
# pdf(file = "predictedValues_community_giantComp.pdf",  
#     width = 5,
#     height = 3.5)
# ggplot(data = df, aes(x=communities_usergiant_leader, y=giantPredictions)) +
#   geom_boxplot() +
#   labs(x = 'In the same community as the party leader', y = 'predicted rank')
# dev.off()
# 
# pdf(file = "predictedValues_pageRank_giantComp.pdf",  
#     width = 5,
#     height = 3.5)
# ggplot(data = df, aes(x=page_rank_usergiant, y=giantPredictions)) +
#   geom_boxplot() +
#   labs(x = 'Page rank, giant component', y = 'predicted rank')
# dev.off()


# df$topicsPredictions = predict(model.topics, predict.all = T)
# ggplot(data = df, aes(x=legge_referendum_cittadini_BT, y=topicsPredictions)) +
#   geom_boxplot() +
#   labs(x = 'Topic: stelle, movimento, lista', y = 'predicted rank')


#### DON'T OVERWRITE ####
# save.image(file = "data/processed/pwdprotect_effect_on_primary.RData")

#### Table 3 ####

var_explained = 
  c(`interface` = avg_interface,
    `offline presence` = avg_offline,
    `online presence` = avg_onlinePresence,
    `platform centrality` = avg_platformCentrality,
    `user giant` = avg_usergiant,
    `topics` = avg_topics)

margin_error = 
  c(`interface` = c(avg_interface - margin.error_interface, 
                    avg_interface + margin.error_interface),
    `offline presence` = c(avg_offline - margin.error_offline, 
                           avg_offline + margin.error_offline),
    `online presence` = c(avg_onlinePresence - margin.error_onlinePresence, 
                          avg_onlinePresence + margin.error_onlinePresence),
    
    `platform centrality` = c(avg_platformCentrality - margin.error_platformCentrality, 
                              avg_platformCentrality + margin.error_platformCentrality),
    `user giant` = c(avg_usergiant - margin.error_usergiant, 
                     avg_usergiant + margin.error_usergiant),
    `topics` = c(avg_topics - margin.error_topics, 
                 avg_topics + margin.error_topics))


